NOTE: This analysis requires at least 10Gb of RAM to run.
input_folder <- "raw_input"
output_folder <- "results"
output_format <- ".pdf"
matrix_plot_depth <- 6
color_range <- c(-4, 4)
NOTE: This analysis requires at least 10Gb of RAM to run.
The function parse_hmp_qiime is a wrapper for extract_taxonomy made for this analysis. It might be made more generic in the future. The information on sample characteristics (e.g. body site) is saved in a table called mapping in the taxmap object.
library(metacoder)
v35_data <- parse_hmp_qiime(otu_file = "http://downloads.hmpdacc.org/data/HMQCP/otu_table_psn_v35.txt.gz",
mapping_file = "http://downloads.hmpdacc.org/data/HMQCP/v35_map_uniquebyPSN.txt.bz2")
v35_data
## `taxmap` object with data for 985 taxa and 45383 observations:
##
## ----------------------------------------- taxa -----------------------------------------
## 1, 2, 3, 4, 5, 6, 7, 8, 9 ... 976, 977, 978, 979, 980, 981, 982, 983, 984, 985
##
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 985 x 4
## taxon_ids supertaxon_ids rank name
## <chr> <chr> <chr> <chr>
## 1 1 <NA> Root
## 2 2 1 p Acidobacteria
## 3 3 1 p Actinobacteria
## 4 4 1 p Armatimonadetes
## 5 5 1 p Bacteroidetes
## 6 6 1 p CCM11b
## 7 7 1 p Chloroflexi
## # ... with 978 more rows
##
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 45,383 x 4,790
## obs_taxon_ids otu_id 700114607 700114380 700114716 700114798 700114710 700114614 700114755
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 452 OTU_97.1 0 0 0 0 0 0 0
## 2 734 OTU_97.10 0 0 0 0 0 0 0
## 3 243 OTU_97.100 0 0 0 0 0 0 0
## 4 735 OTU_97.1000 0 0 0 0 0 0 0
## 5 351 OTU_97.10000 0 0 0 0 0 0 0
## 6 179 OTU_97.10001 0 0 0 0 0 0 0
## 7 734 OTU_97.10002 0 0 0 0 0 0 0
## # ... with 4.538e+04 more rows, and 4781 more variables: 700114715 <dbl>, 700114706 <dbl>,
## # 700114613 <dbl>, 700114803 <dbl>, 700114714 <dbl>, 700114482 <dbl>, 700114439 <dbl>,
## # 700114658 <dbl>, 700114387 <dbl>, 700114441 <dbl>, 700114608 <dbl>, 700111759 <dbl>,
## # 700114603 <dbl>, 700106988 <dbl>, 700113609 <dbl>, 700114612 <dbl>, 700110821 <dbl>,
## # 700114486 <dbl>, 700114481 <dbl>, 700114377 <dbl>, 700114424 <dbl>, 700114712 <dbl>,
## # 700114713 <dbl>, 700114554 <dbl>, 700114606 <dbl>, 700114335 <dbl>, 700114707 <dbl>,
## # 700114442 <dbl>, 700114610 <dbl>, 700114437 <dbl>, 700114327 <dbl>, 700114328 <dbl>,
## # 700114271 <dbl>, 700114223 <dbl>, 700114012 <dbl>, 700114117 <dbl>, 700111749 <dbl>,
## # 700113560 <dbl>, 700114709 <dbl>, 700108534 <dbl>, 700113020 <dbl>, 700114005 <dbl>,
## # 700114212 <dbl>, 700114274 <dbl>, 700114162 <dbl>, 700108165 <dbl>, 700113059 <dbl>,
## # 700114330 <dbl>, 700111035 <dbl>, 700110827 <dbl>, 700114051 <dbl>, 700105883 <dbl>,
## # 700114282 <dbl>, 700114279 <dbl>, 700113019 <dbl>, 700114750 <dbl>, 700114386 <dbl>,
## # 700113502 <dbl>, 700109121 <dbl>, 700111515 <dbl>, 700110308 <dbl>, 700114113 <dbl>,
## # 700113060 <dbl>, 700111520 <dbl>, 700114431 <dbl>, 700114006 <dbl>, 700111243 <dbl>,
## # 700114105 <dbl>, 700114056 <dbl>, 700110426 <dbl>, 700106490 <dbl>, 700110433 <dbl>,
## # 700114385 <dbl>, 700114438 <dbl>, 700111521 <dbl>, 700113016 <dbl>, 700111518 <dbl>,
## # 700111306 <dbl>, 700114440 <dbl>, 700113652 <dbl>, 700114048 <dbl>, 700111449 <dbl>,
## # 700110831 <dbl>, 700114112 <dbl>, 700111824 <dbl>, 700113017 <dbl>, 700114333 <dbl>,
## # 700114004 <dbl>, 700110654 <dbl>, 700114611 <dbl>, 700114210 <dbl>, 700114339 <dbl>,
## # 700111305 <dbl>, 700114170 <dbl>, 700111163 <dbl>, 700110657 <dbl>, 700110173 <dbl>,
## # 700114711 <dbl>, 700113116 <dbl>, 700111581 <dbl>, ...
##
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies
I will calculate total read abundance per taxon for later graphing and filtering.
calculate_abundance <- function(data) {
obs_samples <- colnames(data$obs_data) %in% data$mapping$sample_id
mutate_taxa(data,
abundance = vapply(obs(data),
function(i) sum(data$obs_data[i, obs_samples]), numeric(1)))
}
v35_data <- calculate_abundance(v35_data)
To get an idea of how the data looks overall I will make a plot showing OTU and read abundance of all the data combined.
plot_all <- function(data, output_name, seed = 1) {
set.seed(seed)
data %>%
filter_taxa(abundance >= 100) %>%
filter_taxa(name != "") %>% # Some taxonomic levels are not named
heat_tree(node_size = n_obs,
node_size_axis_label = "Number of OTUs",
node_color = abundance,
node_color_trans = "area",
node_color_axis_label = "Number of reads",
node_label = name,
node_label_max = 100,
overlap_avoidance = 1,
output_file = file.path(output_folder, paste0(output_name, output_format)))
}
plot_all(v35_data, "hmp--v35--all_data")
The HMP dataset is great for comparing treatment since there are many body sites with many replicates so statistical tests can be used to find real correlation between body sites and taxon abundance. The code below applies the Wilcox rank-sum test to differences in median read proportion between every pair of body sties compared. Since the data is compositional in nature (i.e. not idependent samples) we used a non-parametric test and used median instead of mean read proportion.
calculate_prop <- function(data, site) {
sample_cols <- as.character(unlist(data$mapping[data$mapping$body_site == site, "sample_id"]))
sample_cols <- sample_cols[sample_cols %in% colnames(data$obs_data)]
obs_indexes <- obs(data)
total_counts <- vapply(sample_cols, function(s) sum(data$obs_data[, s]), numeric(1))
col_name <- paste0(site, "_median_prop")
lapply(obs_indexes,
function(i) {
vapply(sample_cols,
function(s) sum(data$obs_data[i, s]) / total_counts[s],
numeric(1))})
}
calculate_prop_diff <- function(data, site_1, site_2) {
remove_inf <- function(values) {
values[values == Inf] <- 10000000000000000000000000
values[values == -Inf] <- -10000000000000000000000000
values[is.nan(values)] <- 0
return(values)
}
props_1 <- calculate_prop(data, site_1)
props_2 <- calculate_prop(data, site_2)
p_value <- mapply(function(x, y) wilcox.test(x, y)$p.value,
props_1, props_2)
p_value <- p.adjust(p_value, method = "fdr")
result <- ifelse(p_value > 0.05 | is.nan(p_value), 0,
log2(vapply(props_1, median, numeric(1)) / vapply(props_2, median, numeric(1))))
remove_inf(result)
}
plot_body_site_diff <- function(data, site_1, site_2, output_name, seed = 1) {
set.seed(seed)
data %>%
mutate_taxa(median_prop_diff = calculate_prop_diff(data, site_1, site_2)) %>%
filter_taxa(abundance >= 100) %>%
filter_taxa(name != "") %>% # Some taxonomic levels are not named
heat_tree(node_size_axis_label = "Number of OTUs",
node_size = n_obs,
node_color_axis_label = "Log 2 ratio of median proportions",
node_color = median_prop_diff,
node_color_range = diverging_palette(),
node_color_trans = "linear",
node_color_interval = color_range,
edge_color_interval = color_range,
node_label = name,
node_label_max = 150,
output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, output_format)))
}
plot_body_site_diff(v35_data, "Throat", "Saliva", "hmp--v35")
plot_body_site_diff(v35_data, "Supragingival_plaque", "Subgingival_plaque", "hmp--v35")
plot_body_site_diff(v35_data, "Anterior_nares", "Buccal_mucosa", "hmp--v35")
Here is an example of a more conventional way of displaying the same data that does not supply taxonomic context.
library(ggplot2)
sites_used = c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa", "Anterior_nares")
plot_data <- v35_data
plot_data$taxon_data[sites_used] <- lapply(sites_used,
function(x) sapply(calculate_prop(plot_data, x), mean))
plot_data <- filter_taxa(plot_data, rank == "g")$taxon_data
plot_data <- plot_data[rev(order(plot_data$abundance)), ]
plot_data <- plot_data[1:10, c("name", sites_used)]
plot_data$y <- 10:1
plot_data <- tidyr::gather(plot_data, site, prop, -name, -y)
plot_data$x <- rep(seq_along(sites_used), each = 10)
bubble_plot <- ggplot(plot_data, aes(x = x, y = y, size = prop)) +
geom_point() +
scale_x_continuous(breaks = seq_along(sites_used), labels = sites_used, limits = c(1, 6)) +
scale_y_continuous(breaks = 1:10, labels = rev(unique(plot_data$name)), limits = c(1, 10)) +
guides(size = FALSE) +
ggtitle("Abundant Genera") +
theme(panel.grid = element_blank(),
panel.background = element_blank(),
axis.title = element_blank(),
axis.text.x = element_text(angle = -20, hjust = 0),
legend.background = element_blank())
ggsave(file.path(output_folder, "bubble_plot.pdf"), width = 5, height = 4)
print(bubble_plot)
The graphs above are great for comparing two treatments, but it would be nice to see how many treatments compare in a single graph. To do this, we have developed a pair-wise matrix layout for comparisons of this kind.
# Identify pairs of treatments to compare
body_sites <- c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa", "Anterior_nares")
combinations <- t(combn(seq_along(body_sites), 2))
site_diff_cols <- apply(combinations, MARGIN = 1, function(i) paste0(body_sites[i], collapse = "-"))
layout_matrix <- matrix(rep(NA, (length(body_sites))^2), nrow = length(body_sites))
for (index in 1:nrow(combinations)) {
layout_matrix[combinations[index,1], combinations[index,2]] <- index
}
# Make reduced taxonomy for plotting
v35_reduced <- v35_data
# Calculate significant body site differneces
not_used <- apply(combinations, MARGIN = 1, function(i) {
v35_reduced$taxon_data[[paste0(body_sites[i], collapse = "-")]] <<- calculate_prop_diff(v35_reduced, body_sites[i[1]], body_sites[i[2]])
})
# Remove data not needed for plotting
v35_reduced$obs_data <- v35_reduced$obs_data[, c("obs_taxon_ids", "otu_id")] # remove per-otu data since it is big
# Record taxa with no differences in any treatment
v35_reduced <- mutate_taxa(v35_reduced,
is_diff = apply(v35_reduced$taxon_data[ , site_diff_cols], MARGIN = 1,
function(a_row) any(a_row != 0)))
# Make individual plots
plot_body_site_diff_small <- function(data, diff_col, seed = 1) {
set.seed(seed)
data %>%
mutate_taxa(diff_col = data$taxon_data[[diff_col]]) %>%
filter_taxa(n_supertaxa <= matrix_plot_depth) %>%
filter_taxa(abundance > 1000) %>%
filter_taxa(name != "") %>% # Some taxonomic levels are not named
heat_tree(node_size = n_obs,
node_color = diff_col,
node_color_range = diverging_palette(),
node_color_trans = "linear",
node_color_interval = color_range,
edge_color_interval = color_range,
overlap_avoidance = 0.7,
node_size_axis_label = "Number of OTUs",
node_color_axis_label = "Median proportion difference",
make_legend = FALSE)
}
sub_plots <- lapply(site_diff_cols, function(x) plot_body_site_diff_small(v35_reduced, x))
key_plot <- v35_reduced %>%
filter_taxa(n_supertaxa <= matrix_plot_depth) %>%
filter_taxa(abundance > 1000) %>%
filter_taxa(name != "") %>% # Some taxonomic levels are not named
heat_tree(node_size = n_obs,
node_color = "#DDDDDD",
node_color_range = diverging_palette(),
node_color_trans = "linear",
node_color_interval = color_range,
node_label = ifelse(n_obs > 1000, name, NA),
edge_label = ifelse(n_obs > 1000, NA, name),
node_label_max = 500,
edge_label_max = 500,
edge_label_size_range = c(0.007, 0.02),
node_label_size_range = c(0.007, 0.02),
overlap_avoidance = 0.7,
node_size_axis_label = "Number of OTUs",
node_color_axis_label = "Log 2 ratio of median proportions",
make_legend = TRUE)
calc_subplot_coords <- function(a_matrix, x1 = 0, y1 = 0, x2 = 1, y2 = 1) {
# lowerleft = c(x1, y1), upperright = c(x2, y2)
x_coords <- seq(from = x1, to = x2, length.out = ncol(a_matrix) + 1)[- (ncol(a_matrix) + 1)]
y_coords <- seq(from = y1, to = y2, length.out = nrow(a_matrix) + 1)[- (nrow(a_matrix) + 1)]
do.call(rbind, lapply(1:ncol(a_matrix), function(x) data.frame(plot_index = a_matrix[, x],
x = x_coords[x],
y = rev(y_coords))))
}
# remove empty column/row
layout_matrix <- layout_matrix[! apply(layout_matrix, MARGIN = 1, function(x) all(is.na(x))), ]
layout_matrix <- layout_matrix[ ,! apply(layout_matrix, MARGIN = 2, function(x) all(is.na(x)))]
# Get subplot layout data
matrix_data <- calc_subplot_coords(layout_matrix, x1 = 0.2, y1 = 0.2, x2 = 0.95, y2 = 0.95)
matrix_data$site <- site_diff_cols[layout_matrix]
matrix_data <- tidyr::separate(matrix_data, site, c("site_1", "site_2"), "-")
matrix_data <- matrix_data[!is.na(matrix_data$plot_index), ]
matrix_data <- matrix_data[order(matrix_data$plot_index), ]
rownames(matrix_data) <- matrix_data$plot_index
# Make label data
named_row <- which(apply(layout_matrix, MARGIN = 1, function(x) all(!is.na(x))))
named_col <- which(apply(layout_matrix, MARGIN = 2, function(x) all(!is.na(x))))
horz_label_data <- matrix_data[match(layout_matrix[named_row, ], matrix_data$plot_index), ]
vert_label_data <- matrix_data[match(layout_matrix[, named_col], matrix_data$plot_index), ]
subgraph_width <- abs(horz_label_data$x[1] - horz_label_data$x[2])
subgraph_height <- abs(vert_label_data$y[1] - vert_label_data$y[2])
horz_label_data$label_x <- horz_label_data$x + subgraph_width / 2 # center of label
horz_label_data$label_y <- 0.96 # bottom of label
vert_label_data$label_x <- 0.96 # bottom of rotated label
vert_label_data$label_y <- vert_label_data$y + subgraph_height / 2 # center of rotated label
# Make plot
library(cowplot)
library(metacoder)
label_size <- 12
matrix_plot <- ggdraw() +
draw_plot(key_plot, x = 0, y = -0.03, width = 0.78, height = 0.67) +
draw_text(gsub("_", " ", horz_label_data$site_2),
x = horz_label_data$label_x, y = horz_label_data$label_y,
size = label_size, colour = diverging_palette()[1],
hjust = "center", vjust = "bottom") +
draw_text(gsub("_", " ", vert_label_data$site_1),
x = vert_label_data$label_x, y = vert_label_data$label_y,
size = label_size, colour = diverging_palette()[3],
hjust = "center", vjust = "bottom", angle = -90)
for (i in seq_along(sub_plots)) {
matrix_plot <- matrix_plot + draw_plot(sub_plots[[i]],
x = matrix_data[i, "x"],
y = matrix_data[i, "y"],
width = subgraph_width, height = subgraph_height)
}
print(matrix_plot)
save_plot(file.path(output_folder, "figure_3--hmp_matrix_plot.pdf"),
matrix_plot, base_width = 7.5, base_height = 7.5)
Comments